Smoking Paper - Main

Genus level

Data preparation

> # from Bioconductor (install by running: BiocManager::install("x") | x = name of the package)
> # BiocManager::install(c("DESeq2", "phyloseq", "microbiome"))
> 
> #github functions
> # devtools::install_github("bryandmartin/corncob")
> # remotes::install_github("g-antonello/gautils")
> # remotes::install_github("mikemc/speedyseq")#
> # devtools::install_github("gaospecial/ggvenn")
> 
> library(gautils)
> library(magrittr) # imported separately from tidyverse to also import '%<>%'
> library(ggpubr)
> library(kableExtra)
> 
> # graphics packages
> # library(ggpubr)
> # library(magrittr)
> # # my functions
> # source("my_personal_functions.R")
> # 
> # 
> # library(speedyseq)
> theme_set(theme_light())
> 
> 
> chrismb_phy <- readRDS("/shared/statgen/microbiome/CHRISMB/processed_data/microbiome_tables/phyloseq_built.Rds")
> 
> threshold_prevalence <- 0.01
> threshold_detection <- 10

Download CHRIS data

These are based on data retrieval from the chrisData internal R package curated by J. Rainer. Variable selection is done via consulting the codebook.

> chrismb_phy <- readRDS("/shared/statgen/microbiome/CHRISMB/processed_data/microbiome_tables/phyloseq_built.Rds")
> 
> library(chrisData)
> 
> possible_data_files <- chrisDataFile()
> 
> selected_data_files <- c(
+     "general_information",
+     "household",
+     "clinical_traits",
+     "interview",
+     "labdata",
+     "substudies",
+     "drugs_summary"
+   )
> 
> data_selected.list <- sapply(selected_data_files, function(x) chrisData(paste0(x, ".txt.gz")), USE.NAMES = T)
> 
> data_selected.joined.df <- purrr::reduce(data_selected.list, full_join, by = "AID")
> data_selected.joined.df <- data_selected.joined.df[match(chrismb_phy@sam_data$AID, data_selected.joined.df$AID),]
> 
> # Columns with all NAs must be necessarily removed 
> all_NAs <- apply(data_selected.joined.df, 2, function(x) all(is.na(x)))
> data_selected.joined.df <- data_selected.joined.df[, !all_NAs]
> 
> # Columns of class "chron" are problematic, they must be removed to be able to work with phyloseq and metadata data.frame back and forth
> data_selected.joined_NODates.df <- select(data_selected.joined.df, -contains(c("x0bp08", "x0bc03", "x0bc06")))
> 
> rm(data_selected.list)
> 
> chrismb_phy <- phy_add_metadata_variables(chrismb_phy, df = data_selected.joined_NODates.df, by = "AID")
> 
> 
> levels(chrismb_phy@sam_data$x0oh01) <- c("0", "1-9", "10-19", "20+")
> levels(chrismb_phy@sam_data$x0sm51) <- c("Never", "Former", "Current (R)", "Current (NR)")
> chrismb_phy@sam_data$x0an03a %<>% as.numeric()
> 
> chrismb_phy@sam_data$x0_ager_cat <- cut(chrismb_phy@sam_data$x0_ager, breaks = c(18, 31, 41, 51, 61, 71, max(chrismb_phy@sam_data$x0_ager)+1), include.lowest = TRUE, labels = c("18-30", "31-40", "41-50", "51-60", "61-70", "71+"))
> 
> # create some househod (hid) info
> chrismb_phy <- relocate_sample_data(chrismb_phy, x0_ager_cat, .after = x0_ager) %>% 
+   mutate_sample_data(
+     hid = paste0("h", hid),
+     hid_size_chrismb = add_N_to_catVar(hid) %>% as.character() %>% gsub("h.*\\ ", "", .) %>% parse_number(),
+   smoking_exposure_ga = 
+     case_when(
+       grepl("Current", as.character(x0sm51)) ~ "Current",
+       TRUE ~ as.character(x0sm51)
+       ) %>% 
+     factor(levels = c("Never", "Former", "Current"))
+   ) %>% 
+   relocate_sample_data(smoking_exposure_ga, .before = x0sm51)

Smoking variables generation

Bin smoking intensity (g/day)

> ###############################
> ## bin daily tobacco smoked
> ###############################
> 
> chrismb_phy %<>%
+   mutate_sample_data(
+     ### tobacco g per day (bin 5 by 5)
+     x0sm61_bin5cont = case_when(
+       smoking_exposure_ga == "Current" & x0sm61 <= 1 ~ 1,
+       smoking_exposure_ga == "Current" & x0sm61 <= 5 ~ 5,
+       smoking_exposure_ga == "Current" & x0sm61 <= 10 ~ 10,
+       smoking_exposure_ga == "Current" & x0sm61 <= 15 ~ 15,
+       smoking_exposure_ga == "Current" & x0sm61 <= 20 ~ 20,
+       smoking_exposure_ga == "Current" & x0sm61 <= 25 ~ 25,
+       smoking_exposure_ga == "Current" & x0sm61 <= 30 ~ 30,
+       TRUE ~ NA
+     ),
+     
+     x0sm61_bin2cont = case_when(
+       smoking_exposure_ga == "Current" & x0sm61 <= 2 ~ 2,
+       smoking_exposure_ga == "Current" & x0sm61 <= 4 ~ 4,
+       smoking_exposure_ga == "Current" & x0sm61 <= 6 ~ 6,
+       smoking_exposure_ga == "Current" & x0sm61 <= 8 ~ 8,
+       smoking_exposure_ga == "Current" & x0sm61 <= 10 ~ 10,
+       smoking_exposure_ga == "Current" & x0sm61 <= 12 ~ 12,
+       smoking_exposure_ga == "Current" & x0sm61 <= 14 ~ 14,
+       smoking_exposure_ga == "Current" & x0sm61 <= 16 ~ 16,
+       smoking_exposure_ga == "Current" & x0sm61 <= 18 ~ 18,
+       smoking_exposure_ga == "Current" & x0sm61 <= 20 ~ 20,
+       smoking_exposure_ga == "Current" & x0sm61 <= 22 ~ 22,
+       smoking_exposure_ga == "Current" & x0sm61 <= 24 ~ 24,
+       smoking_exposure_ga == "Current" & x0sm61 <= 26 ~ 26,
+       smoking_exposure_ga == "Current" & x0sm61 <= 28 ~ 28,
+       smoking_exposure_ga == "Current" & x0sm61 <= 30 ~ 30,
+       smoking_exposure_ga == "Current" & x0sm61 <= 60 ~ 60,
+       TRUE ~ NA
+     ))
Error in `dplyr::mutate()`:
! Problem while computing `x0sm61_bin5cont = case_when(...)`.
Caused by error in `case_when()`:

Bin Years since quitting

> # x0sm40 = age when people quit or reduce
> # x0_ager = age rounded to closest integer
> 
> max_value <- max(chrismb_phy@sam_data$x0_ager - chrismb_phy@sam_data$x0sm40, na.rm = T)
> 
> chrismb_phy %<>% 
+   mutate_sample_data(
+     years_since_quitting_or_reducing = x0_ager - x0sm40,
+     
+     years_since_quitting_binned = case_when(
+       smoking_exposure_ga == "Former" & !is.na(years_since_quitting_or_reducing) ~ cut(years_since_quitting_or_reducing, 
+                                             breaks = c(0, 1, 3, 5, 10, max_value), 
+       include.lowest = TRUE
+     )
+   )
+ )
> 
> rm(max_value)

Ultimate smoking variable for plots: “smoking_detailed”

> max_years_since_quitting <- max(chrismb_phy@sam_data$years_since_quitting_or_reducing, na.rm = T)
> 
> chrismb_phy %<>%
+   mutate_sample_data(
+     smoking_detailed = case_when(
+       smoking_exposure_ga == "Never" ~ "Never",
+       
+       smoking_exposure_ga == "Former" & years_since_quitting_or_reducing <= 1 ~ "Former 0-1 y",
+       smoking_exposure_ga == "Former" & years_since_quitting_or_reducing <= 3 ~ "Former 2-3 y",
+       smoking_exposure_ga == "Former" & years_since_quitting_or_reducing <= 5 ~ "Former 4-5 y",
+       smoking_exposure_ga == "Former" & years_since_quitting_or_reducing <= 10 ~ "Former 6-10 y",
+       smoking_exposure_ga == "Former" & years_since_quitting_or_reducing <= max_years_since_quitting ~ "Former 11-62 y",
+       smoking_exposure_ga == "Former" & is.na(years_since_quitting_or_reducing) ~ "Former NA y",
+       
+       smoking_exposure_ga == "Current" & x0sm61 < 2 ~ "Current < 2 g",
+       smoking_exposure_ga == "Current" & x0sm61 <= 3 ~ "Current 2-3 g",
+       smoking_exposure_ga == "Current" & x0sm61 <= 5 ~ "Current 4-5 g",
+       smoking_exposure_ga == "Current" & x0sm61 <= 10 ~ "Current 6-10 g",
+       smoking_exposure_ga == "Current" & x0sm61 <= 15 ~ "Current 11-15 g",
+       smoking_exposure_ga == "Current" & x0sm61 <= 20 ~ "Current 16-20 g",
+       smoking_exposure_ga == "Current" & x0sm61 > 20 ~ "Current > 20 g",
+       smoking_exposure_ga == "Current" & is.na(x0sm61) ~ "Current NA g",
+       TRUE ~ "Unknown"
+       ),
+     
+     smoking_detailed = factor(smoking_detailed, levels = unique(smoking_detailed)[c(14, 4, 10, 6, 3, 7, 12, 8, 11, 13, 5, 9, 2, 16, 1, 15)])
+     )

Available Samples for statistical analysis

> excluded_samples_upon_request <- readLines("/home/gantonello/CHRISMB/participants lists each study/withdrawnConsents_AIDs.txt")
> 
> chrismb_phy_core <- core(chrismb_phy, detection = threshold_detection, prevalence = threshold_prevalence)
> 
> phy_Q1 <- chrismb_phy_core %>% 
+   filter_sample_data(!(AID %in% excluded_samples_upon_request),
+                      !is.na(x0oh01),
+                      !is.na(x0_ager),
+                      !is.na(x0_sex),
+                      !is.na(x0sm51),
+                      #x0bc10 != 1, # people who smoked were reported as 1, NAs are treated as non-smoking
+                      x0dd51 == "No")
> 
> phy_Q1
phyloseq-class experiment-level object
otu_table()   OTU Table:          [ 622 taxa and 1601 samples ]:
sample_data() Sample Data:        [ 1601 samples by 1500 sample variables ]:
tax_table()   Taxonomy Table:     [ 622 taxa by 8 taxonomic ranks ]:
phy_tree()    Phylogenetic Tree:  [ 622 tips and 621 internal nodes ]:
refseq()      DNAStringSet:       [ 622 reference sequences ]
taxa are rows
> phy_Q1_genus <- phy_tax_glom(phy_Q1, "Genus")
> 
> 
> phy_Q1_meta <- meta(phy_Q1)
> dir.create("../results")

Table 1

Table shorter

> library(table1)
> 
> tbl1 <- phy_Q1_meta %>% 
+   # do some renaming, just for the Table output
+   dplyr::rename(
+     Sex = x0_sex,
+     `Age Category (years)` = x0_ager_cat, 
+     `N° Teeth (self-reported)` = x0oh01, 
+     `Gums Health (self-reported)` = x0oh02b
+   ) %>% 
+ # pipe this temporary data into the table 1
+   table1(~ Sex + `Age Category (years)` + `N° Teeth (self-reported)` + `Gums Health (self-reported)`| x0sm51, 
+        data = ., 
+        extra.col = list("X² p-value" = table1.pvalues),
+        caption = "Demographics of CHRISMB cohort, in South Tyrol, Italy, with respect to smoking habit. Per-column percentages were also reported in brackets. Current smokers were separated into smokers who reduced daily smoking dosage some time in the past - Current (R), and those who did not reduce - Current (NR). The whole cohort is included under the “CHRISMB” column. Significance is calculated as X² test for categorical variables and as Kruskal-Wallis non-parametric analysis of variance for continuous variables (Blood pressure, Body Mass Index",
+        #overall = "CHRISMB"
+        )
> 
> tbl1
Demographics of CHRISMB cohort, in South Tyrol, Italy, with respect to smoking habit. Per-column percentages were also reported in brackets. Current smokers were separated into smokers who reduced daily smoking dosage some time in the past - Current (R), and those who did not reduce - Current (NR). The whole cohort is included under the “CHRISMB” column. Significance is calculated as X² test for categorical variables and as Kruskal-Wallis non-parametric analysis of variance for continuous variables (Blood pressure, Body Mass Index
Never
(N=880)
Former
(N=395)
Current (R)
(N=100)
Current (NR)
(N=226)
Overall
(N=1601)
X² p-value
Sex
Male 356 (40.5%) 222 (56.2%) 58 (58.0%) 115 (50.9%) 751 (46.9%) 5.1e-07
Female 524 (59.5%) 173 (43.8%) 42 (42.0%) 111 (49.1%) 850 (53.1%)
Age Category (years)
18-30 238 (27.0%) 41 (10.4%) 38 (38.0%) 92 (40.7%) 409 (25.5%) 2.2e-17
31-40 139 (15.8%) 73 (18.5%) 18 (18.0%) 39 (17.3%) 269 (16.8%)
41-50 196 (22.3%) 75 (19.0%) 23 (23.0%) 41 (18.1%) 335 (20.9%)
51-60 144 (16.4%) 112 (28.4%) 12 (12.0%) 39 (17.3%) 307 (19.2%)
61-70 93 (10.6%) 57 (14.4%) 8 (8.0%) 15 (6.6%) 173 (10.8%)
71+ 70 (8.0%) 37 (9.4%) 1 (1.0%) 0 (0%) 108 (6.7%)
N° Teeth (self-reported)
0 50 (5.7%) 23 (5.8%) 7 (7.0%) 9 (4.0%) 89 (5.6%) 0.11
1-9 57 (6.5%) 41 (10.4%) 4 (4.0%) 16 (7.1%) 118 (7.4%)
10-19 117 (13.3%) 74 (18.7%) 13 (13.0%) 35 (15.5%) 239 (14.9%)
20+ 656 (74.5%) 257 (65.1%) 76 (76.0%) 166 (73.5%) 1155 (72.1%)
Gums Health (self-reported)
Excellent 45 (5.1%) 18 (4.6%) 4 (4.0%) 12 (5.3%) 79 (4.9%) 0.44
Very good 188 (21.4%) 79 (20.0%) 20 (20.0%) 44 (19.5%) 331 (20.7%)
Good 291 (33.1%) 124 (31.4%) 30 (30.0%) 57 (25.2%) 502 (31.4%)
Average 229 (26.0%) 84 (21.3%) 27 (27.0%) 72 (31.9%) 412 (25.7%)
Poor 47 (5.3%) 24 (6.1%) 9 (9.0%) 13 (5.8%) 93 (5.8%)
Very poor 6 (0.7%) 2 (0.5%) 3 (3.0%) 0 (0%) 11 (0.7%)
Missing 74 (8.4%) 64 (16.2%) 7 (7.0%) 28 (12.4%) 173 (10.8%)

Table more variables

> tbl2 <- phy_Q1_meta %>% 
+   # do some renaming, just for the Table output
+   dplyr::rename(
+     Sex = x0_sex,
+     `Age Category (years)` = x0_ager_cat, 
+     `N° Teeth (self-reported)` = x0oh01, 
+     `Gums Health (self-reported)` = x0oh02b,
+     `Systolic Blood Pressure (mmHg)` = x0bp01,
+     `Diastolic Blood Pressure (mmHg)` = x0bp02,
+     `Body Mass Index (kg/m²)` = x0an03a
+   ) %>% 
+ # pipe this temporary data into the table 1
+   table1(~ Sex + `Age Category (years)` + `N° Teeth (self-reported)` + `Gums Health (self-reported)` + `Systolic Blood Pressure (mmHg)` + `Diastolic Blood Pressure (mmHg)` + `Body Mass Index (kg/m²)`| x0sm51, 
+        data = ., 
+        extra.col = list("P-value" = table1.pvalues),
+        caption = "Demographics of CHRISMB cohort, in South Tyrol, Italy, with respect to smoking habit. Per-column percentages were also reported in brackets. Current smokers were separated into smokers who reduced daily smoking dosage some time in the past - Current (R), and those who did not reduce - Current (NR). The whole cohort is included under the “CHRISMB” column. Abbreviations: mmHg = millimeters of mercury, kg = kilograms, m² square meters. Significance is calculated as X² test for categorical variables and as Kruskal-Wallis non-parametric analysis of variance for continuous variables (Blood pressure, Body Mass Index",
+        #overall = "CHRISMB"
+        )
> 
> tbl2
Demographics of CHRISMB cohort, in South Tyrol, Italy, with respect to smoking habit. Per-column percentages were also reported in brackets. Current smokers were separated into smokers who reduced daily smoking dosage some time in the past - Current (R), and those who did not reduce - Current (NR). The whole cohort is included under the “CHRISMB” column. Abbreviations: mmHg = millimeters of mercury, kg = kilograms, m² square meters. Significance is calculated as X² test for categorical variables and as Kruskal-Wallis non-parametric analysis of variance for continuous variables (Blood pressure, Body Mass Index
Never
(N=880)
Former
(N=395)
Current (R)
(N=100)
Current (NR)
(N=226)
Overall
(N=1601)
P-value
Sex
Male 356 (40.5%) 222 (56.2%) 58 (58.0%) 115 (50.9%) 751 (46.9%) 5.1e-07
Female 524 (59.5%) 173 (43.8%) 42 (42.0%) 111 (49.1%) 850 (53.1%)
Age Category (years)
18-30 238 (27.0%) 41 (10.4%) 38 (38.0%) 92 (40.7%) 409 (25.5%) 2.2e-17
31-40 139 (15.8%) 73 (18.5%) 18 (18.0%) 39 (17.3%) 269 (16.8%)
41-50 196 (22.3%) 75 (19.0%) 23 (23.0%) 41 (18.1%) 335 (20.9%)
51-60 144 (16.4%) 112 (28.4%) 12 (12.0%) 39 (17.3%) 307 (19.2%)
61-70 93 (10.6%) 57 (14.4%) 8 (8.0%) 15 (6.6%) 173 (10.8%)
71+ 70 (8.0%) 37 (9.4%) 1 (1.0%) 0 (0%) 108 (6.7%)
N° Teeth (self-reported)
0 50 (5.7%) 23 (5.8%) 7 (7.0%) 9 (4.0%) 89 (5.6%) 0.11
1-9 57 (6.5%) 41 (10.4%) 4 (4.0%) 16 (7.1%) 118 (7.4%)
10-19 117 (13.3%) 74 (18.7%) 13 (13.0%) 35 (15.5%) 239 (14.9%)
20+ 656 (74.5%) 257 (65.1%) 76 (76.0%) 166 (73.5%) 1155 (72.1%)
Gums Health (self-reported)
Excellent 45 (5.1%) 18 (4.6%) 4 (4.0%) 12 (5.3%) 79 (4.9%) 0.44
Very good 188 (21.4%) 79 (20.0%) 20 (20.0%) 44 (19.5%) 331 (20.7%)
Good 291 (33.1%) 124 (31.4%) 30 (30.0%) 57 (25.2%) 502 (31.4%)
Average 229 (26.0%) 84 (21.3%) 27 (27.0%) 72 (31.9%) 412 (25.7%)
Poor 47 (5.3%) 24 (6.1%) 9 (9.0%) 13 (5.8%) 93 (5.8%)
Very poor 6 (0.7%) 2 (0.5%) 3 (3.0%) 0 (0%) 11 (0.7%)
Missing 74 (8.4%) 64 (16.2%) 7 (7.0%) 28 (12.4%) 173 (10.8%)
Systolic Blood Pressure (mmHg)
Mean (SD) 126 (14.3) 129 (15.6) 122 (11.7) 124 (12.0) 126 (14.3) 3e-05
Median [Min, Max] 124 [98.0, 203] 128 [99.0, 188] 121 [97.0, 157] 122 [100, 170] 124 [97.0, 203]
Missing 79 (9.0%) 37 (9.4%) 7 (7.0%) 15 (6.6%) 138 (8.6%)
Diastolic Blood Pressure (mmHg)
Mean (SD) 78.0 (8.72) 79.6 (9.48) 75.0 (7.62) 76.8 (8.30) 78.0 (8.86) 3.3e-05
Median [Min, Max] 77.0 [58.0, 115] 79.0 [58.0, 109] 74.0 [59.0, 98.0] 76.0 [58.0, 102] 77.0 [58.0, 115]
Missing 79 (9.0%) 37 (9.4%) 7 (7.0%) 15 (6.6%) 138 (8.6%)
Body Mass Index (kg/m²)
Mean (SD) 25.2 (4.36) 26.8 (4.59) 26.0 (4.69) 25.4 (4.17) 25.7 (4.46) 6.4e-08
Median [Min, Max] 24.5 [16.4, 48.3] 26.1 [18.1, 51.3] 25.1 [17.1, 43.0] 25.0 [18.2, 37.4] 25.1 [16.4, 51.3]

Figure 1 - Microbiota in relation to smoking status

Figure 1A

> div <- phyloseq::distance(microbiome::transform(phy_Q1_genus, "compositional"), "bray")
> 
> ord <- ordinate(physeq = phy_Q1_genus,
+                 distance = div,
+                 method = "PCoA"
+ )
> 
> fig1A <- plot_ordination(physeq = phy_Q1_genus,
+                          ordination = ord, 
+                          color = "smoking_exposure_ga")+ 
+   labs(title = NULL)+
+   theme_light()+
+   theme(legend.position = "bottom")+ 
+   scale_color_discrete(name = "Smoking habit",type = ggsci::pal_jco("default")(4)[c(1,2,4)])+ 
+   stat_ellipse(aes(group = smoking_exposure_ga), show.legend = FALSE)+
+   guides(color = guide_legend(override.aes = list(size=3)))
> 
> fig1A$labels$caption <- gsub(fig1A$labels$caption, pattern = "\n", replacement = "; ")
> fig1A$labels$x <- paste0("Principal Coordinate 1   ", strsplit(fig1A$labels$x, "   ")[[1]][[2]])
> fig1A$labels$y <- paste0("Principal Coordinate 2   ", strsplit(fig1A$labels$y, "   ")[[1]][[2]])

Figure 1B

> dir.create("../results/Figure1/DiffAbundGenera/", recursive = T)

Diff abund 1: DESeq2

> dir.create("../results/Figure1/DiffAbundGenera/DESeq2/", recursive = T)
> 
> if(!file.exists("../results/Figure1/DiffAbundGenera/DESeq2/figure1_deseq2_dataframe.list.RDS")){
+ library(DESeq2)
+ #############################################################################
+ 
+ deseq_object_genus <-
+   phyloseq_to_deseq2(phy_Q1_genus, design = ~ x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga)
+ 
+ geom_means <- apply(counts(deseq_object_genus), 1, function(x) exp(mean(log(x[x>0]))))
+ 
+ # estimate size factors, important to account for library size variations
+ estim_size <- estimateSizeFactors(deseq_object_genus, geoMeans = geom_means)
+ 
+ # compute the DESeq2 glm, accounting for the estimate size factor and geometric means
+ 
+ deseq2_results_genus <- DESeq(estim_size, fitType = "local")
+ 
+ 
+ saveRDS(deseq2_results_genus, "../results/Figure1/DiffAbundGenera/DESeq2/figure1_deseq2_dataframe.list.RDS")
+ } else{
+   deseq2_results_genus <- readRDS("../results/Figure1/DiffAbundGenera/DESeq2/figure1_deseq2_dataframe.list.RDS")
+ }

Diff abund 2: LinDA

Diff abund 3: Maaslin2

Diff abund 4: ALDEx2 (glm)

> dir.create("../results/Figure1/DiffAbundGenera/ALDEx2/", recursive = T)
> 
> if (!file.exists("../results/Figure1/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults_genera.list.RDS")) {
+   
+   library(ALDEx2)
+   # define model matrix, based on the metadata
+   model_mtx <-
+     model.matrix(
+       ~ x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga,
+       data = dplyr::select(
+         meta(phy_Q1_genus),
+         x0oh01,
+         x0_sex,
+         x0_ager_cat,
+         smoking_exposure_ga
+       )
+     )
+   # perform the clr differential abundance
+   aldex.glm.genera_smoking_ga <-
+     aldex.clr(abundances(phy_Q1_genus),
+               model_mtx,
+               mc.samples = 200,
+               denom = "all", 
+               useMC = T
+               )
+   
+   # make glm, obtain significance of terms
+   glm.test_genera_smoking_ga <- aldex.glm(aldex.glm.genera_smoking_ga, model_mtx)
+   
+   # this step gets estimates effect sizes, it takes ages, like 30-60 minutes
+   glm.effects_genera_smoking_ga <- aldex.glm.effect(aldex.glm.genera_smoking_ga, include.sample.summary = T, verbose = FALSE, useMC = T)
+   
+   # aggregate these results into one data frame, that looks like lme4 output
+   aldex.results.genera <- list(clr.model = aldex.glm.genera_smoking_ga,
+                         glm.test = glm.test_genera_smoking_ga,
+                         eff_size.test = glm.effects_genera_smoking_ga)
+   
+   
+   saveRDS(aldex.results.genera, "../results/Figure1/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults_genera.list.RDS")
+   
+ } else {
+   aldex.results.genera <- readRDS("../results/Figure1/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults_genera.list.RDS")
+ }

Diff abund 5: ANCOM-BC

> dir.create("../results/Figure1/DiffAbundGenera/ANCOM-BC/", recursive = T)
> 
> if (!file.exists("../results/Figure1/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults_genera.list.RDS")) {
+ tmp <- mia::makeTreeSummarizedExperimentFromPhyloseq(phy_Q1_genus %>% select_sample_data(x0_sex, x0_ager_cat, x0oh01, smoking_exposure_ga)) 
+ 
+ 
+ library(ANCOMBC)
+ 
+ # takes around 30 minutes
+ ancom_results <- ancombc2(data = tmp,
+         assay_name = "counts", 
+         tax_level = NULL,
+         fix_formula = "x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga", 
+         group = "smoking_exposure_ga",
+         struc_zero = TRUE,
+         iter_control = list(tol = 1e-5, max_iter = 20, verbose = FALSE),
+         em_control = list(tol = 1e-5, max_iter = 20),  
+         p_adj_method = "BH",
+         n_cl = 1,
+         alpha = 0.05)
+ 
+ saveRDS(ancom_results, "../results/Figure1/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults_genera.list.RDS")
+ detach("package:ANCOMBC", unload = T)
+ }else{
+   
+   ancom_results <- readRDS("../results/Figure1/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults_genera.list.RDS")
+   
+ }

Summary of results

Results filtered by two conditions:

  1. FDR (5%) corrected \(q-value < 0.05\)
  2. The effect size, irrespectively of the way it is estimated, must be larger than 0.5

Since this value is rather arbitrary, one could devise a quantile cutoff, but it’s hard to decide where to cut, since we have no a priori expectations of the effect size distributions.

Consensus of methods, quick discussion

> ggarrange(ggarrange(plot_correlations_eff.sizes, 
+                     venn_plot+
+                       scale_x_continuous(expand = expansion(mult = .2)), 
+                     
+                     nrow = 2, ncol = 1, heights = c(1,0.5)
+                     ), 
+           
+           heatmap_plot, 
+           
+           nrow = 1, ncol = 2
+           ) %>% 
+   annotate_figure(top = text_grob("Discussion of differential abundance\nresults similarity across methods", size = 16, face = "bold", color = "red2"),
+                   bottom = text_grob(paste("NAs in the heatmap were arbitrarily set to", NA_value, "in order not to fail clustering,\nwhile still keeping evident track of where a method did not return a pathway as significant"), size = 10))

Figure 1C - Oxygen Metabolism

> dir.create("../results/Figure1/OxygenMetabolism/", recursive = T)
> 
> otutab_with_genus_name <- phy_Q1_genus %>%
+   microbiome::transform("compositional") %>%
+   abundances() %>% 
+   as.data.frame() %>% 
+   rownames_to_column("Genus") %>% # next step helps assigning the highest number of genera/families possible (because among the taxa in the table there are some families too) 
+   mutate(Genus_easier = sapply(strsplit(as.character(Genus), "_"), function(g) g[[1]]))
> 
> # get the metadata of the oxygen metabolism vs Genus
> oxygen_requirement_taxa <- read.csv("https://raw.githubusercontent.com/mcalgaro93/sc2meta/master/data/genera_methabolism.tsv", sep = "\t") %>% 
+   set_names(c("Genus_easier", "oxygen_requirements")) %>% 
+   mutate(oxygen_requirements = gsub("F ", "Facultative ", oxygen_requirements, fixed = T))
> 
> # merge otu table and oxygen requirement data by simplified genus name
> molten_df_merged_oxygen <- merge(otutab_with_genus_name, oxygen_requirement_taxa, by = "Genus_easier", all.x = TRUE)
> molten_df_merged_oxygen$oxygen_requirements <- ifelse(is.na(molten_df_merged_oxygen$oxygen_requirements), "NA",molten_df_merged_oxygen$oxygen_requirements)
> 
> # aggregate by oxygen metabolism, instead of genera, to make code easier
> 
> tmp <- molten_df_merged_oxygen %>% 
+   dplyr::select(!contains("Genus")) %>%
+   group_by(oxygen_requirements) %>% 
+   summarise_all(sum) %>%
+   column_to_rownames("oxygen_requirements") %>% 
+   t() %>% as.data.frame() %>% 
+   rownames_to_column("sample_name")
> 
> oxygen_molten_metadata <- tmp %>% reshape2::melt() %>% 
+   merge(., phy_Q1_meta, all.x = TRUE, by = "sample_name") %>% 
+   dplyr::rename(oxygen = variable)
> 
> saveRDS(oxygen_molten_metadata, "../results/Figure1/OxygenMetabolism/oxygen_with_metadata_df.RDS")
> library(rstatix)
> 
> signif <- filter(oxygen_molten_metadata, oxygen != "NA") %>%
+   mutate(aerobiosis = oxygen) %>% 
+   group_by(oxygen) %>% 
+   rstatix::wilcox_test(value ~ smoking_exposure_ga) %>% 
+   adjust_pvalue(method = "BH") %>% 
+   #add_significance("p.adj") %>% 
+   add_xy_position(x = "oxygen",step.increase = 0.05) %>% 
+   filter(p.adj < 0.05)
> 
> fig1C <- ggboxplot(data = filter(oxygen_molten_metadata, oxygen != "NA"),
+                    x = "oxygen", 
+                    y = "value", 
+                    fill = "smoking_exposure_ga") + 
+   ylab("Relative Abundance") + 
+   xlab("")+ 
+   scale_fill_discrete(name = "Smoking habit", type = ggsci::pal_jco()(4)[c(1,2,4)]) + 
+   stat_pvalue_manual(signif, tip.length = 0.01) + 
+   theme_light()+
+   theme(legend.position = "none")

Figure 1 Panel

> Fig1_ggpanel <- ggarrange(
+   ggarrange(fig1A, 
+             fig1C, 
+             
+             nrow = 2, ncol = 1, labels = c("A","C"), heights = c(1,0.8)
+             ), 
+   ggplot() + theme_void(),
+   fig1B, 
+   
+   nrow = 1, ncol = 3, labels = c("", "B", ""), widths = c(1,0.1,1)
+   )
> 
> ggsave(
+   plot = Fig1_ggpanel,
+   filename = "../results/Figure1/Figure 1.png",
+   height = 7,
+   width = 8,
+   units = "in",
+   dpi = 600
+ )
> 
> Fig1_ggpanel

Figure 2 - Current smokers, regression vs dosage per day

> phy_Q2 <- phy_Q1 %>% 
+   filter_sample_data(smoking_exposure_ga == "Current") %>%
+   filter_sample_data(!is.na(x0sm61)) %>%
+   filter_sample_data(x0sm61 < 60)
> 
> phy_Q2 <- subset_taxa(phy_Q2, taxa_sums(phy_Q2)>0)
> 
> phy_Q2_genus <- phy_tax_glom(phy_Q2, "Genus")
> 
> dir.create("../results/Figure2", recursive = TRUE, showWarnings = FALSE)
> 
> phy_Q2_genus
phyloseq-class experiment-level object
otu_table()   OTU Table:          [ 82 taxa and 308 samples ]:
sample_data() Sample Data:        [ 308 samples by 1500 sample variables ]:
tax_table()   Taxonomy Table:     [ 82 taxa by 8 taxonomic ranks ]:
phy_tree()    Phylogenetic Tree:  [ 82 tips and 81 internal nodes ]:
refseq()      DNAStringSet:       [ 82 reference sequences ]
taxa are rows

Regression

> if(!file.exists("../results/Figure2/deseq_smoking_regression_datafames.rds")){
+ deseq_raw_results_Q2 <- list()
+ for(gr in c("x0sm61_bin5cont", "x0sm61_bin2cont", "x0sm61")){
+   
+   # create deseq object
+   fla <- as.formula(paste0("~ x0_ager + x0_sex + x0oh01 + ", gr))
+   
+   phy_q2_genus_deseq_q2 <- phyloseq_to_deseq2(phy_Q2_genus, design = fla)
+   
+   # calculate geometric means, important to center the counts
+   
+   geom_means_q2 <- apply(counts(phy_q2_genus_deseq_q2), 1, function(x)
+       exp(mean(log(x[x > 0]))))
+   
+   # estimate size factors, important to account for library size variations
+   estim_size_q2 <- estimateSizeFactors(phy_q2_genus_deseq_q2, geoMeans = geom_means_q2)
+   
+   # compute the DESeq2 glm, accounting for the estimate size factor and geometric means
+   
+   deseq2_results_q2 <- DESeq(estim_size_q2, fitType="local")
+   deseq_raw_results_Q2[[gr]] <- deseq2_results_q2
+ }
+ 
+ taxonomy <- as.data.frame(tax_table(phy_Q2_genus))[c("Genus", "Phylum")]
+ q2_results <- lapply(deseq_raw_results_Q2, function(res) results(res) %>% 
+                        as.data.frame() %>% 
+                        rownames_to_column("Genus") %>%
+                        merge(taxonomy, by = "Genus", sort = FALSE, all.x = TRUE)
+ )
+ 
+ saveRDS(q2_results, "../results/Figure2/deseq_smoking_regression_datafames.rds")
+ rm(taxonomy)
+ 
+ } else{
+   
+   q2_results <- readRDS("../results/Figure2/deseq_smoking_regression_datafames.rds")
+ }

Figure 2A - Heatmap

Figure 2 B,C,D

> Fig2BCD <-
+   filter(
+     oxygen_molten_metadata,
+     oxygen != "NA" &
+       smoking_exposure_ga == "Current",
+     x0sm61 < 60
+   ) %>%
+   group_split(oxygen) %>%
+   lapply(
+     function(ox)
+       ggscatter(ox,
+                 x = "x0sm61",
+                 y = "value",
+                 alpha = 0.5,
+                 color = "gray30") +
+       geom_smooth(se = TRUE) +
+       theme_light()+
+       labs(
+         x = "Tobacco smoked per day (g)",
+         y = "Relative Abundance",
+         title = unique(ox$oxygen) 
+         )
+ )
> filter(
+     oxygen_molten_metadata,
+     oxygen == "Aerobic" &
+       smoking_exposure_ga == "Current",
+     x0sm61 < 60
+   ) %>% 
+   lm(value ~ I(1/x0sm61) + x0_ager + x0_sex + x0oh01, data = .) %>% 
+   summary() %>% 
+   broom::tidy() %>%
+   mutate(term = c("Intercept", "1/tobacco exposure intensity (g/day)", "Age", "Sex", "1-9 teeth", "10-19 teeth", "20+ teeth")) %>% 
+   kbl(caption = "The relative abundance of AEROBIC bacteria decreases with increasing grams of tobacco smoked in a day, up to 10 g/day. After 10 grams, it is no longer statistically associated (data not shown)", digits = 3) %>% 
+   kable_styling()
Table 1: The relative abundance of AEROBIC bacteria decreases with increasing grams of tobacco smoked in a day, up to 10 g/day. After 10 grams, it is no longer statistically associated (data not shown)
term estimate std.error statistic p.value
Intercept -0.007 0.012 -0.598 0.550
1/tobacco exposure intensity (g/day) 0.027 0.008 3.545 0.000
Age 0.000 0.000 1.579 0.115
Sex 0.005 0.004 1.322 0.187
1-9 teeth 0.004 0.011 0.331 0.741
10-19 teeth 0.024 0.009 2.571 0.011
20+ teeth 0.024 0.009 2.645 0.009

Figure 2 - Panel

> # Fig2_ggpanel <- ggarrange(
> #   ggplot()+ theme_void(),
> #   ggplotify::as.ggplot(Fig2A), 
> #   ggarrange(
> #     Fig2BCD[[1]], 
> #     Fig2BCD[[2]], 
> #     Fig2BCD[[3]], 
> #     
> #     nrow = 3, ncol = 1, labels = c("B", "C", "D")
> #     ), 
> #   
> #   nrow = 1, ncol = 3, widths = c(0.1, 1.5,1), labels = c("A",  "", "")
> #   )
> 
> Fig2_ggpanel <- ggarrange(
+   #ggplot()+ theme_void(),
+   ggplotify::as.ggplot(Fig2A), 
+   ggarrange(
+     Fig2BCD[[1]], 
+     Fig2BCD[[2]], 
+     Fig2BCD[[3]], 
+     
+     nrow = 3, ncol = 1, labels = c("B", "C", "D")
+     ), 
+   
+   nrow = 1, ncol = 2, 
+   widths = c(1.5,1), 
+   labels = c("A", "")
+   )
> 
> 
> ggsave(
+   plot = Fig2_ggpanel,
+   filename ="../results/Figure2/Figure 2.png",
+   height = 7,
+   width = 8,
+   units = "in",
+   dpi = 600
+ )
> 
> Fig2_ggpanel

Figure 3 - Former smokers, regression vs years since quitting

> dir.create("../results/Figure3/")
> phy_Q3 <- filter_sample_data(phy_Q1, 
+                              smoking_exposure_ga == "Former", 
+                              !is.na(years_since_quitting_or_reducing), 
+                              x0sm40 >= x0sm34 # meaning: (age when they quit) >= (age when they started)
+                              )
> 
> phy_Q3 <- subset_taxa(phy_Q3, taxa_sums(phy_Q3) > 0)
> 
> phy_Q3@sam_data$years_since_quitting_or_reducing %<>% as.numeric()
> 
> phy_Q3_genus <- phy_tax_glom(phy_Q3, "Genus")
> 
> phy_Q3
phyloseq-class experiment-level object
otu_table()   OTU Table:          [ 622 taxa and 369 samples ]:
sample_data() Sample Data:        [ 369 samples by 1500 sample variables ]:
tax_table()   Taxonomy Table:     [ 622 taxa by 8 taxonomic ranks ]:
phy_tree()    Phylogenetic Tree:  [ 622 tips and 621 internal nodes ]:
refseq()      DNAStringSet:       [ 622 reference sequences ]
taxa are rows

Regression calculations and summary table

> library(DESeq2)
> 
> phy_q3_genus_deseq <- phyloseq_to_deseq2(phy_Q3_genus, design = ~ x0_ager_cat + x0_sex + x0oh01 + years_since_quitting_or_reducing)
> 
> # calculate geometric means, important to center the counts
> geom_means_q3 <- apply(counts(phy_q3_genus_deseq), 1, function(x)
+       exp(mean(log(x[x > 0]))))
> 
> # estimate size factors, important to account for library size variations
> estim_size_q3 <- estimateSizeFactors(phy_q3_genus_deseq, geoMeans = geom_means_q3)
> 
> # compute the DESeq2 glm, accounting for the estimate size factor and geometric means
> 
> deseq2_results_q3 <- DESeq(estim_size_q3, fitType="local")
> 
> results_q3 <- results(deseq2_results_q3, name = "years_since_quitting_or_reducing")
> 
> results_q3.df <- as.data.frame(results_q3)
> 
> DT::datatable(arrange(results_q3.df, padj) %>% mutate_if(is.numeric, round, 3), caption = "Table 'Years since quitting' | Results of regression of microbial genera in response to the years since quitting smoking. The model was corrected for age, sex and number of teeth, nominal P-values correceted for False Discovery Rate with the Benjamini-Hochberg method.")

Figure 3A - Heatmap

Figure 3 B,C,D

> fig3BCD <-
+   filter(
+     oxygen_molten_metadata,
+     oxygen != "NA" &
+       sample_name %in% sample_names(phy_Q3_genus) &
+     years_since_quitting_or_reducing <= 40
+   ) %>%
+   group_split(oxygen) %>%
+   lapply(
+     function(ox)
+       ggscatter(ox,
+                 x = "years_since_quitting_or_reducing",
+                 y = "value",
+                 alpha = 0.5
+       ) +
+       geom_smooth(se = TRUE) +
+       xlab("Years since quitting smoking") +
+       ylab("Relative Abundance") +
+       ggtitle(unique(ox$oxygen)) +
+       
+       theme_light()+geom_point(alpha = 0.1, color = "gray50")
+   )
> filter(
+     oxygen_molten_metadata,
+     oxygen == "Aerobic" &
+         sample_name %in% sample_names(phy_Q3_genus) &
+         years_since_quitting_or_reducing <= 20) %>% 
+     lm(value ~ years_since_quitting_or_reducing + x0_ager + x0_sex + x0oh01, data = .) %>% 
+   summary() %>% 
+   broom::tidy() %>%
+   mutate(term = c("Intercept", "Years since quitting", "Age", "Sex", "1-9 teeth", "10-19 teeth", "20+ teeth")) %>% 
+   kbl(caption = "The relative abundance of AEROBIC bacteria increases with increasing years since quitting, up to 20 years. After 20 years, it is no longer statistically associated (data not shown)", digits = 3) %>% 
+   kable_styling()
Table 2: The relative abundance of AEROBIC bacteria increases with increasing years since quitting, up to 20 years. After 20 years, it is no longer statistically associated (data not shown)
term estimate std.error statistic p.value
Intercept 0.024 0.024 0.995 0.321
Years since quitting 0.001 0.001 1.944 0.053
Age 0.000 0.000 1.022 0.308
Sex 0.002 0.006 0.359 0.720
1-9 teeth 0.029 0.017 1.703 0.090
10-19 teeth 0.025 0.017 1.494 0.137
20+ teeth 0.020 0.016 1.254 0.211
> filter(
+     oxygen_molten_metadata,
+     oxygen == "Anaerobic" &
+         sample_name %in% sample_names(phy_Q3_genus) &
+         years_since_quitting_or_reducing <= 20) %>% 
+     lm(value ~ years_since_quitting_or_reducing + x0_ager + x0_sex + x0oh01, data = .) %>% 
+   summary() %>% 
+   broom::tidy() %>%
+   mutate(term = c("Intercept", "Years since quitting", "Age", "Sex", "1-9 teeth", "10-19 teeth", "20+ teeth")) %>% 
+   kbl(caption = "The relative abundance of ANAEROBIC bacteria increases with increasing years since quitting, up to 20 years. After 20 years, it is no longer statistically associated (data not shown)", digits = 3) %>% 
+   kable_styling()
Table 3: The relative abundance of ANAEROBIC bacteria increases with increasing years since quitting, up to 20 years. After 20 years, it is no longer statistically associated (data not shown)
term estimate std.error statistic p.value
Intercept 0.704 0.050 14.095 0.000
Years since quitting -0.001 0.001 -0.691 0.491
Age -0.001 0.001 -2.212 0.028
Sex -0.016 0.012 -1.309 0.192
1-9 teeth -0.025 0.036 -0.685 0.494
10-19 teeth -0.003 0.035 -0.090 0.929
20+ teeth 0.011 0.034 0.313 0.755

Figure 3 - Panel

> Fig3_ggpanel <- ggarrange(
+   ggplotify::as.ggplot(fig3A), 
+   ggarrange(fig3BCD[[1]], fig3BCD[[2]], fig3BCD[[3]], 
+             
+             nrow = 3, ncol = 1, labels = c("B", "C", "D")
+   ), 
+   
+   nrow = 1, ncol = 2, widths = c(1.5,1), labels = c("A", "")
+   )
> 
> 
> ggsave(
+   plot = Fig3_ggpanel,
+   filename = "../results/Figure3/Figure 3.png",
+   height = 7,
+   width = 8,
+   units = "in",
+   dpi = 600
+ )
> 
> Fig3_ggpanel

Figure 4 - Pathways differential abundance

PICRUSt2 input data were generated using the full CHRISMB dataset filtered for prevalence and detection (resp. 1% and 10 counts), which had 623 features for 1928 samples. I though I would do this for all samples once and for all, the set of ASVs doesn’t change if I pick the smoking paper subset or the CHRISMB full one. The discriminative part is the set of ASVs.

Results are stored in /home/gantonello/CHRISMB/PICRUSt2_prediction_v2.5/ subdivided into stratified and unstratified. the R script to generate the input data is in each of those two base directories.

Then I imported the two informative datasets:

  • EC codes per sample (like looking at single enzyme level)
  • pathway abundances (like looking at the aggregated enzymes)

HOW DOES PICRUSt2 ASSIGN EC AND PATHWAYS? It has some mapping files inside the environment.

If you run the picrust_pipeline.py with default settings, PICRUSt2 will take information from:

/home/gantonello/miniconda3/envs/picrust2/lib/python3.8/site-packages/picrust2/ And from there it will:

  1. Assign the ASVs reference seqs to the closest bacterial genome
  2. If the distance from the closest annotated reference genome is less than 2, it assumes identity and will assign a set of potential functions using ./default_files/prokaryotic/ec.txt.gz
  3. EC numbers are converted into enzyme names with ./default_files/pathway_mapfiles/ec_level4_to_metacyc_rxn.tsv
  4. these enzyme names are assigned to pathway using ./default_files/pathway_mapfiles/metacyc_path2rxn_struc_filt_pro.txt

Then the names get nicer names with a simple, final function, and that’s what I imported for the analysis.

Let’s start with pathways, correcting for age category, sex, and N° teeth, as the rest of the paper draft

> dir.create("../results/Figure4/", showWarnings = F)
> picrust_pathways_matrix <- data.table::fread("/home/gantonello/CHRISMB/PICRUSt2_prediction_v2.5/unstratified/picrust2_output/pathways_out/path_abun_unstrat_descrip.tsv.gz") %>% 
+   dplyr::select(-pathway) %>% 
+   column_to_rownames("description") %>% 
+   as.matrix()
> 
> picrust_pathways_matrix %<>% round(0)
> 
> picrust_pathways_matrix_Q1 <- picrust_pathways_matrix[!grepl("super", rownames(picrust_pathways_matrix)),sample_names(phy_Q1)]
> 
> #create a map data frame for pathway names
> 
> pathway_names_map_ALL <- data.frame(original_name = rownames(picrust_pathways_matrix_Q1),
+                                 fixed_name = make.names(rownames(picrust_pathways_matrix_Q1)))

Diff abund 1: DESeq2

> dir.create("../results/Figure4/DiffAbundGenera/DESeq2/", recursive = T)
> 
> if(!file.exists("../results/Figure4/DiffAbundGenera/DESeq2/figure4_deseq2_dataframe.list.RDS")){
+   
+ library(DESeq2)
+ #############################################################################
+ 
+ deseq_object_picrust <- phyloseq(otu_table(picrust_pathways_matrix_Q1, taxa_are_rows = TRUE), sample_data(phy_Q1))  %>% 
+   core(detection = 10, prevalence = 0.05) %>% 
+   phyloseq_to_deseq2(design = ~ x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga)
+ 
+ geom_means <- apply(counts(deseq_object_picrust), 1, function(x) exp(mean(log(x[x>0]))))
+ 
+ # estimate size factors, important to account for library size variations
+ estim_size <- estimateSizeFactors(deseq_object_picrust, geoMeans = geom_means)
+ 
+ # compute the DESeq2 glm, accounting for the estimate size factor and geometric means
+ 
+ deseq2_results_picrust <- DESeq(estim_size, fitType = "local")
+ 
+ saveRDS(deseq2_results_picrust, "../results/Figure4/DiffAbundGenera/DESeq2/figure4_deseq2_dataframe.list.RDS")
+ } else{
+   deseq2_results_picrust <- readRDS("../results/Figure4/DiffAbundGenera/DESeq2/figure4_deseq2_dataframe.list.RDS")
+ }

Diff abund 2: LinDA

Diff abund 3: Maaslin2

Diff abund 4: ALDEx2 (glm)

> dir.create("../results/Figure4/DiffAbundGenera/ALDEx2/", recursive = T)
> 
> if (!file.exists("../results/Figure4/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults.list.RDS")) {
+   
+   library(ALDEx2)
+   
+   model_mtx <-
+     model.matrix(
+       ~  x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga,
+       data = dplyr::select(
+         phy_Q1_meta,
+         x0oh01,
+         x0_sex,
+         x0_ager_cat,
+         smoking_exposure_ga
+       ) %>% .[intersect(colnames(picrust_pathways_matrix_Q1),
+                         rownames(phy_Q1_meta)), ]
+     )
+   
+   aldex.glm.pathway_smoking_ga <-
+     picrust_pathways_matrix_Q1[, intersect(colnames(picrust_pathways_matrix_Q1),
+                                                      rownames(phy_Q1_meta))] %>% 
+     aldex.clr(
+               model_mtx,
+               mc.samples = 150,
+               denom = "all", 
+               useMC = T
+               )
+   
+   glm.test_pathway_smoking_ga <-
+     aldex.glm(aldex.glm.pathway_smoking_ga, model_mtx)
+   
+   glm.effects_pathway_smoking_ga <-
+     aldex.glm.effect(aldex.glm.pathway_smoking_ga, verbose = FALSE, include.sample.summary = T, useMC = T)
+   
+   aldex.results <- list(clr.model = aldex.glm.pathway_smoking_ga,
+                         glm.test = glm.test_pathway_smoking_ga,
+                         eff_size.test = glm.effects_pathway_smoking_ga)
+   
+   
+   saveRDS(aldex.results, "../results/Figure4/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults.list.RDS")
+ 
+   detach("package:ALDEx2", unload = T)
+   detach("package:zCompositions", unload = T)
+   
+ } else{
+   aldex.results <- readRDS("../results/Figure4/DiffAbundGenera/ALDEx2/ALDEx2.DA.rawResults.list.RDS")
+ }

Diff abund 5: ANCOM-BC

> dir.create("../results/Figure4/DiffAbundGenera/ANCOM-BC/", recursive = T)
> 
> if (!file.exists("../results/Figure4/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults.list.RDS")) {
+ tmp <- phyloseq(otu_table(picrust_pathways_matrix_Q1, taxa_are_rows = T), sample_data(phy_Q1_genus %>%
+   select_sample_data(x0_sex, x0_ager_cat, x0oh01, smoking_exposure_ga) %>% meta())) %>%
+   select_sample_data(x0_sex, x0_ager_cat, x0oh01, smoking_exposure_ga) %>% 
+   core(detection = 10, prevalence = 0.05) %>%
+   mia::makeTreeSummarizedExperimentFromPhyloseq()
+ 
+ 
+ library(ANCOMBC)
+ 
+ ancom_results_picrust <- ancombc2(data = tmp,
+         assay_name = "counts", 
+         tax_level = NULL,
+         fix_formula = "x0_sex + x0_ager_cat + x0oh01 + smoking_exposure_ga", 
+         group = "smoking_exposure_ga",
+         struc_zero = TRUE,
+         iter_control = list(tol = 1e-5, max_iter = 20, verbose = FALSE),
+         em_control = list(tol = 1e-5, max_iter = 20),  
+         p_adj_method = "BH",
+         n_cl = 1,
+         alpha = 0.05)
+ 
+ saveRDS(ancom_results_picrust, "../results/Figure4/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults.list.RDS")
+ 
+ }else{
+   
+   ancom_results_picrust <- readRDS("../results/Figure4/DiffAbundGenera/ANCOM-BC/ANCOM-BC.DA.rawResults.list.RDS")
+   
+ }

Summary of results

Results filtered by two conditions:

  1. FDR (5%) corrected \(q-value < 0.05\)
  2. The effect size, irrespectively of the way it is estimated, must be larger than 0.5
  • Relative Abundance
  • Mean per smoking group
  • Z-scaled for plotting
> if(!exists("pathways_consensus")){
+   pathways_consensus <- readRDS("../results/Figure4/significant_PWYs_consensus.RDS")
+ }
> 
> dummy_tax_table <- data.frame(pathway = rownames(picrust_pathways_matrix_Q1)) %>% set_rownames(rownames(picrust_pathways_matrix_Q1)) %>% as.matrix()
> 
> 
> pathways_summarized_plot <- phyloseq(otu_table(picrust_pathways_matrix_Q1, taxa_are_rows = T),
+          sample_data(phy_Q1_meta), tax_table(dummy_tax_table)) %>% 
+   phy_transform("compositional") %>% 
+   filter_sample_data(sample_name %in% c(sample_names(phy_Q2), sample_names(phy_Q3)) | smoking_exposure_ga == "Never") %>% 
+   filter_tax_table(taxa_names(.) %in% pathways_consensus) %>% 
+   mutate_sample_data(smoking_detailed = add_N_to_catVar(smoking_detailed)) %>%
+   phy_summarise_taxa_by_metadata(metadata_var = "smoking_detailed", .fun = mean)
> 
> ggsave(plot = ggplotify::as.ggplot(pheatmap::pheatmap(pathways_summarized_plot, scale = "none", main = "Unscaled Rows", cluster_cols = FALSE, angle_col = 315)), 
+        filename = "../results/Figure4/Figure 4 picrust_unscaledRows.png", dpi = 600, height = 6, width = 8)

> # save also scaled rows for potential alternative use
> 
> ggsave(plot = ggplotify::as.ggplot(pheatmap::pheatmap(pathways_summarized_plot, scale = "row", main = "Scaled Rows", cluster_cols = FALSE, angle_col = 315)), 
+        filename = "../results/Figure4/Figure4 picrust_scaledRows.png", dpi = 600, height = 6, width = 8)

Interesting note, the CLR transformation is dependent somehow on the number of features in the table, meaning that if you subset before creating the phyloseq object to make the transformation, you end up with a correct transformation, but scaled only on those pathways you selected. In my case, this approach was masking a few signals, so my advice is:

  • create a phyloseq object with all features (taxa, pathways, Kegg Orthology enzymes …) (optional, I use it because the transformaton functions are implemented)
  • re-convert into data.frame with either speedyseq::psmelt() or phy_OtuMetaTable()
  • make all the plots you want

Notice that many pathways are actually highly correlated, like the quinols. let’s see this in a heatmap

> signif_pathways_names <- readRDS("../results/Figure4/significant_PWYs_consensus.RDS")
> 
> picrust_pathways_matrix_Q1[signif_pathways_names,] %>% 
+   apply(2, function(s) s/sum(s)) %>% 
+   t() %>% 
+   stats::cor(method = "spearman") %>% 
+   pheatmap::pheatmap(show_colnames = FALSE, main = "Spearman correlation between significant pathways")

> ggarrange(ggarrange(plot_correlations_eff.sizes, 
+                     venn_plot+
+                       scale_x_continuous(expand = expansion(mult = .2)), 
+                     
+                     nrow = 2, ncol = 1, heights = c(1,0.5)
+                     ), 
+           
+           heatmap_plot, 
+           
+           nrow = 1, ncol = 2
+           ) %>% 
+   annotate_figure(top = text_grob("Discussion of differential abundance\nresults similarity across methods", size = 16, face = "bold", color = "red2"),
+                   bottom = text_grob(paste("NAs in the heatmap were arbitrarily set to", NA_value, "in order not to fail clustering,\nwhile still keeping evident track of where a method did not return a pathway as significant"), size = 10))

From these results we find out that effect sizes and the number o ALDEx2 and ANCOM-BC are further away, and that LinDA, despite being considered by authors as very similar to ANCOM-BC, in our results it seems more similar to MaAsLin2, with the added value of being extremely fast.